"""
RAG Management API Routes

Provides endpoints for managing RAG indexing of library documents:
- Configure embedding models
- Index documents
- Get RAG statistics
- Bulk operations with progress tracking
"""

from flask import (
    Blueprint,
    jsonify,
    request,
    Response,
    render_template,
    session,
    stream_with_context,
)
from loguru import logger
import glob
import json
import uuid
import time
import threading
import queue
from datetime import datetime, UTC
from pathlib import Path
from typing import Optional

from ...web.auth.decorators import login_required
from ...utilities.db_utils import get_settings_manager
from ..services.library_rag_service import LibraryRAGService
from ...security.path_validator import PathValidator
from ..utils import handle_api_error
from ...database.models.library import (
    Document,
    Collection,
    DocumentCollection,
    RAGIndex,
    SourceType,
    EmbeddingProvider,
)
from ...database.models.queue import TaskMetadata
from ...web.utils.rate_limiter import limiter

rag_bp = Blueprint("rag", __name__, url_prefix="/library")


def get_rag_service(collection_id: Optional[str] = None) -> LibraryRAGService:
    """
    Get RAG service instance with appropriate settings.

    If collection_id is provided:
    - Uses collection's stored settings if they exist
    - Uses current defaults for new collections (and stores them)

    If no collection_id:
    - Uses current default settings
    """
    from ...database.session_context import get_user_db_session

    settings = get_settings_manager()
    username = session["username"]

    # Get current default settings
    default_embedding_model = settings.get_setting(
        "local_search_embedding_model", "all-MiniLM-L6-v2"
    )
    default_embedding_provider = settings.get_setting(
        "local_search_embedding_provider", "sentence_transformers"
    )
    default_chunk_size = int(
        settings.get_setting("local_search_chunk_size", 1000)
    )
    default_chunk_overlap = int(
        settings.get_setting("local_search_chunk_overlap", 200)
    )

    # Get new advanced configuration settings (Issue #1054)
    import json

    default_splitter_type = settings.get_setting(
        "local_search_splitter_type", "recursive"
    )
    default_text_separators = settings.get_setting(
        "local_search_text_separators", '["\n\n", "\n", ". ", " ", ""]'
    )
    # Parse JSON string to list
    if isinstance(default_text_separators, str):
        try:
            default_text_separators = json.loads(default_text_separators)
        except json.JSONDecodeError:
            logger.warning(
                f"Invalid JSON for local_search_text_separators setting: {default_text_separators!r}. "
                "Using default separators."
            )
            default_text_separators = ["\n\n", "\n", ". ", " ", ""]
    default_distance_metric = settings.get_setting(
        "local_search_distance_metric", "cosine"
    )
    # Ensure normalize_vectors is a proper boolean (settings may return string)
    raw_normalize = settings.get_setting("local_search_normalize_vectors", True)
    if isinstance(raw_normalize, str):
        default_normalize_vectors = raw_normalize.lower() in (
            "true",
            "1",
            "yes",
        )
    else:
        default_normalize_vectors = bool(raw_normalize)
    default_index_type = settings.get_setting("local_search_index_type", "flat")

    # If collection_id provided, check for stored settings
    if collection_id:
        with get_user_db_session(username) as db_session:
            collection = (
                db_session.query(Collection).filter_by(id=collection_id).first()
            )

            if collection and collection.embedding_model:
                # Use collection's stored settings
                logger.info(
                    f"Using stored settings for collection {collection_id}: "
                    f"{collection.embedding_model_type.value}/{collection.embedding_model}"
                )
                # Handle normalize_vectors - may be stored as string in some cases
                coll_normalize = collection.normalize_vectors
                if coll_normalize is not None:
                    if isinstance(coll_normalize, str):
                        coll_normalize = coll_normalize.lower() in (
                            "true",
                            "1",
                            "yes",
                        )
                    else:
                        coll_normalize = bool(coll_normalize)
                else:
                    coll_normalize = default_normalize_vectors

                return LibraryRAGService(
                    username=username,
                    embedding_model=collection.embedding_model,
                    embedding_provider=collection.embedding_model_type.value,
                    chunk_size=collection.chunk_size or default_chunk_size,
                    chunk_overlap=collection.chunk_overlap
                    or default_chunk_overlap,
                    splitter_type=collection.splitter_type
                    or default_splitter_type,
                    text_separators=collection.text_separators
                    or default_text_separators,
                    distance_metric=collection.distance_metric
                    or default_distance_metric,
                    normalize_vectors=coll_normalize,
                    index_type=collection.index_type or default_index_type,
                )
            elif collection:
                # New collection - use defaults and store them
                logger.info(
                    f"New collection {collection_id}, using and storing default settings"
                )

                # Create service with defaults
                service = LibraryRAGService(
                    username=username,
                    embedding_model=default_embedding_model,
                    embedding_provider=default_embedding_provider,
                    chunk_size=default_chunk_size,
                    chunk_overlap=default_chunk_overlap,
                    splitter_type=default_splitter_type,
                    text_separators=default_text_separators,
                    distance_metric=default_distance_metric,
                    normalize_vectors=default_normalize_vectors,
                    index_type=default_index_type,
                )

                # Store settings on collection (will be done during indexing)
                # Note: We don't store here because we don't have embedding_dimension yet
                # It will be stored in index_collection when first document is indexed

                return service

    # No collection or fallback - use current defaults
    return LibraryRAGService(
        username=username,
        embedding_model=default_embedding_model,
        embedding_provider=default_embedding_provider,
        chunk_size=default_chunk_size,
        chunk_overlap=default_chunk_overlap,
        splitter_type=default_splitter_type,
        text_separators=default_text_separators,
        distance_metric=default_distance_metric,
        normalize_vectors=default_normalize_vectors,
        index_type=default_index_type,
    )


# Page Routes


@rag_bp.route("/embedding-settings")
@login_required
def embedding_settings_page():
    """Render the Embedding Settings page."""
    return render_template(
        "pages/embedding_settings.html", active_page="embedding-settings"
    )


@rag_bp.route("/document/<string:document_id>/chunks")
@login_required
def view_document_chunks(document_id):
    """View all chunks for a document across all collections."""
    from ...database.session_context import get_user_db_session
    from ...database.models.library import DocumentChunk

    username = session.get("username")

    with get_user_db_session(username) as db_session:
        # Get document info
        document = db_session.query(Document).filter_by(id=document_id).first()

        if not document:
            return "Document not found", 404

        # Get all chunks for this document
        chunks = (
            db_session.query(DocumentChunk)
            .filter(DocumentChunk.source_id == document_id)
            .order_by(DocumentChunk.collection_name, DocumentChunk.chunk_index)
            .all()
        )

        # Group chunks by collection
        chunks_by_collection = {}
        for chunk in chunks:
            coll_name = chunk.collection_name
            if coll_name not in chunks_by_collection:
                # Get collection display name
                collection_id = coll_name.replace("collection_", "")
                collection = (
                    db_session.query(Collection)
                    .filter_by(id=collection_id)
                    .first()
                )
                chunks_by_collection[coll_name] = {
                    "name": collection.name if collection else coll_name,
                    "id": collection_id,
                    "chunks": [],
                }

            chunks_by_collection[coll_name]["chunks"].append(
                {
                    "id": chunk.id,
                    "index": chunk.chunk_index,
                    "text": chunk.chunk_text,
                    "word_count": chunk.word_count,
                    "start_char": chunk.start_char,
                    "end_char": chunk.end_char,
                    "embedding_model": chunk.embedding_model,
                    "embedding_model_type": chunk.embedding_model_type.value
                    if chunk.embedding_model_type
                    else None,
                    "embedding_dimension": chunk.embedding_dimension,
                    "created_at": chunk.created_at,
                }
            )

        return render_template(
            "pages/document_chunks.html",
            document=document,
            chunks_by_collection=chunks_by_collection,
            total_chunks=len(chunks),
        )


@rag_bp.route("/collections")
@login_required
def collections_page():
    """Render the Collections page."""
    return render_template("pages/collections.html", active_page="collections")


@rag_bp.route("/collections/<string:collection_id>")
@login_required
def collection_details_page(collection_id):
    """Render the Collection Details page."""
    return render_template(
        "pages/collection_details.html",
        active_page="collections",
        collection_id=collection_id,
    )


@rag_bp.route("/collections/<string:collection_id>/upload")
@login_required
def collection_upload_page(collection_id):
    """Render the Collection Upload page."""
    # Get the upload PDF storage setting
    settings = get_settings_manager()
    upload_pdf_storage = settings.get_setting(
        "research_library.upload_pdf_storage", "none"
    )
    # Only allow valid values for uploads (no filesystem)
    if upload_pdf_storage not in ("database", "none"):
        upload_pdf_storage = "none"

    return render_template(
        "pages/collection_upload.html",
        active_page="collections",
        collection_id=collection_id,
        collection_name=None,  # Could fetch from DB if needed
        upload_pdf_storage=upload_pdf_storage,
    )


@rag_bp.route("/collections/create")
@login_required
def collection_create_page():
    """Render the Create Collection page."""
    return render_template(
        "pages/collection_create.html", active_page="collections"
    )


# API Routes


@rag_bp.route("/api/rag/settings", methods=["GET"])
@login_required
def get_current_settings():
    """Get current RAG configuration from settings."""
    import json as json_lib

    try:
        settings = get_settings_manager()

        # Get text separators and parse if needed
        text_separators = settings.get_setting(
            "local_search_text_separators", '["\n\n", "\n", ". ", " ", ""]'
        )
        if isinstance(text_separators, str):
            try:
                text_separators = json_lib.loads(text_separators)
            except json_lib.JSONDecodeError:
                logger.warning(
                    f"Invalid JSON for local_search_text_separators setting: {text_separators!r}. "
                    "Using default separators."
                )
                text_separators = ["\n\n", "\n", ". ", " ", ""]

        normalize_vectors = settings.get_setting(
            "local_search_normalize_vectors", True
        )

        return jsonify(
            {
                "success": True,
                "settings": {
                    "embedding_provider": settings.get_setting(
                        "local_search_embedding_provider",
                        "sentence_transformers",
                    ),
                    "embedding_model": settings.get_setting(
                        "local_search_embedding_model", "all-MiniLM-L6-v2"
                    ),
                    "chunk_size": settings.get_setting(
                        "local_search_chunk_size", 1000
                    ),
                    "chunk_overlap": settings.get_setting(
                        "local_search_chunk_overlap", 200
                    ),
                    "splitter_type": settings.get_setting(
                        "local_search_splitter_type", "recursive"
                    ),
                    "text_separators": text_separators,
                    "distance_metric": settings.get_setting(
                        "local_search_distance_metric", "cosine"
                    ),
                    "normalize_vectors": normalize_vectors,
                    "index_type": settings.get_setting(
                        "local_search_index_type", "flat"
                    ),
                },
            }
        )
    except Exception as e:
        return handle_api_error("getting RAG settings", e)


@rag_bp.route("/api/rag/test-embedding", methods=["POST"])
@login_required
def test_embedding():
    """Test an embedding configuration by generating a test embedding."""

    try:
        data = request.json
        provider = data.get("provider")
        model = data.get("model")
        test_text = data.get("test_text", "This is a test.")

        if not provider or not model:
            return jsonify(
                {"success": False, "error": "Provider and model are required"}
            ), 400

        # Import embedding functions
        from ...embeddings.embeddings_config import (
            get_embedding_function,
        )

        logger.info(
            f"Testing embedding with provider={provider}, model={model}"
        )

        # Get embedding function with the specified configuration
        start_time = time.time()
        embedding_func = get_embedding_function(
            provider=provider,
            model_name=model,
        )

        # Generate test embedding
        embedding = embedding_func([test_text])[0]
        response_time_ms = int((time.time() - start_time) * 1000)

        # Get embedding dimension
        dimension = len(embedding) if hasattr(embedding, "__len__") else None

        return jsonify(
            {
                "success": True,
                "dimension": dimension,
                "response_time_ms": response_time_ms,
                "provider": provider,
                "model": model,
            }
        )

    except Exception as e:
        return handle_api_error("testing embedding", e)


@rag_bp.route("/api/rag/models", methods=["GET"])
@login_required
def get_available_models():
    """Get list of available embedding providers and models."""
    try:
        from ...embeddings.embeddings_config import _get_provider_classes

        # Get current settings for providers
        settings = get_settings_manager()
        settings_snapshot = (
            settings.get_all_settings()
            if hasattr(settings, "get_all_settings")
            else {}
        )

        # Get provider classes
        provider_classes = _get_provider_classes()

        # Provider display names
        provider_labels = {
            "sentence_transformers": "Sentence Transformers (Local)",
            "ollama": "Ollama (Local)",
            "openai": "OpenAI API",
        }

        # Get provider options and models by looping through providers
        provider_options = []
        providers = {}

        for provider_key, provider_class in provider_classes.items():
            if provider_class.is_available(settings_snapshot):
                # Add provider option
                provider_options.append(
                    {
                        "value": provider_key,
                        "label": provider_labels.get(
                            provider_key, provider_key
                        ),
                    }
                )

                # Get models for this provider
                models = provider_class.get_available_models(settings_snapshot)
                providers[provider_key] = [
                    {
                        "value": m["value"],
                        "label": m["label"],
                        "provider": provider_key,
                    }
                    for m in models
                ]

        return jsonify(
            {
                "success": True,
                "provider_options": provider_options,
                "providers": providers,
            }
        )
    except Exception as e:
        return handle_api_error("getting available models", e)


@rag_bp.route("/api/rag/info", methods=["GET"])
@login_required
def get_index_info():
    """Get information about the current RAG index."""
    from ...database.library_init import get_default_library_id

    try:
        # Get collection_id from request or use default Library collection
        collection_id = request.args.get("collection_id")
        if not collection_id:
            collection_id = get_default_library_id(session["username"])

        logger.info(
            f"Getting RAG index info for collection_id: {collection_id}"
        )

        rag_service = get_rag_service(collection_id)
        info = rag_service.get_current_index_info(collection_id)

        if info is None:
            logger.info(
                f"No RAG index found for collection_id: {collection_id}"
            )
            return jsonify(
                {"success": True, "info": None, "message": "No index found"}
            )

        logger.info(f"Found RAG index for collection_id: {collection_id}")
        return jsonify({"success": True, "info": info})
    except Exception as e:
        return handle_api_error("getting index info", e)


@rag_bp.route("/api/rag/stats", methods=["GET"])
@login_required
def get_rag_stats():
    """Get RAG statistics for a collection."""
    from ...database.library_init import get_default_library_id

    try:
        # Get collection_id from request or use default Library collection
        collection_id = request.args.get("collection_id")
        if not collection_id:
            collection_id = get_default_library_id(session["username"])

        rag_service = get_rag_service(collection_id)
        stats = rag_service.get_rag_stats(collection_id)

        return jsonify({"success": True, "stats": stats})
    except Exception as e:
        return handle_api_error("getting RAG stats", e)


@rag_bp.route("/api/rag/index-document", methods=["POST"])
@login_required
def index_document():
    """Index a single document in a collection."""
    from ...database.library_init import get_default_library_id

    try:
        data = request.get_json()
        text_doc_id = data.get("text_doc_id")
        force_reindex = data.get("force_reindex", False)
        collection_id = data.get("collection_id")

        if not text_doc_id:
            return jsonify(
                {"success": False, "error": "text_doc_id is required"}
            ), 400

        # Get collection_id from request or use default Library collection
        if not collection_id:
            collection_id = get_default_library_id(session["username"])

        rag_service = get_rag_service(collection_id)
        result = rag_service.index_document(
            text_doc_id, collection_id, force_reindex
        )

        if result["status"] == "error":
            return jsonify(
                {"success": False, "error": result.get("error")}
            ), 400

        return jsonify({"success": True, "result": result})
    except Exception as e:
        return handle_api_error(f"indexing document {text_doc_id}", e)


@rag_bp.route("/api/rag/remove-document", methods=["POST"])
@login_required
def remove_document():
    """Remove a document from RAG in a collection."""
    from ...database.library_init import get_default_library_id

    try:
        data = request.get_json()
        text_doc_id = data.get("text_doc_id")
        collection_id = data.get("collection_id")

        if not text_doc_id:
            return jsonify(
                {"success": False, "error": "text_doc_id is required"}
            ), 400

        # Get collection_id from request or use default Library collection
        if not collection_id:
            collection_id = get_default_library_id(session["username"])

        rag_service = get_rag_service(collection_id)
        result = rag_service.remove_document_from_rag(
            text_doc_id, collection_id
        )

        if result["status"] == "error":
            return jsonify(
                {"success": False, "error": result.get("error")}
            ), 400

        return jsonify({"success": True, "result": result})
    except Exception as e:
        return handle_api_error(f"removing document {text_doc_id}", e)


@rag_bp.route("/api/rag/index-research", methods=["POST"])
@login_required
def index_research():
    """Index all documents from a research."""
    try:
        data = request.get_json()
        research_id = data.get("research_id")
        force_reindex = data.get("force_reindex", False)

        if not research_id:
            return jsonify(
                {"success": False, "error": "research_id is required"}
            ), 400

        rag_service = get_rag_service()
        results = rag_service.index_research_documents(
            research_id, force_reindex
        )

        return jsonify({"success": True, "results": results})
    except Exception as e:
        return handle_api_error(f"indexing research {research_id}", e)


@rag_bp.route("/api/rag/index-all", methods=["GET"])
@login_required
def index_all():
    """Index all documents in a collection with Server-Sent Events progress."""
    from ...database.session_context import get_user_db_session
    from ...database.library_init import get_default_library_id

    force_reindex = request.args.get("force_reindex", "false").lower() == "true"
    username = session["username"]

    # Get collection_id from request or use default Library collection
    collection_id = request.args.get("collection_id")
    if not collection_id:
        collection_id = get_default_library_id(username)

    logger.info(
        f"Starting index-all for collection_id: {collection_id}, force_reindex: {force_reindex}"
    )

    # Create RAG service in request context before generator runs
    rag_service = get_rag_service(collection_id)

    def generate():
        """Generator function for SSE progress updates."""
        try:
            # Send initial status
            yield f"data: {json.dumps({'type': 'start', 'message': 'Starting bulk indexing...'})}\n\n"

            # Get document IDs to index from DocumentCollection
            with get_user_db_session(username) as db_session:
                # Query Document joined with DocumentCollection for the collection
                query = (
                    db_session.query(Document.id, Document.title)
                    .join(
                        DocumentCollection,
                        Document.id == DocumentCollection.document_id,
                    )
                    .filter(DocumentCollection.collection_id == collection_id)
                )

                if not force_reindex:
                    # Only index documents that haven't been indexed yet
                    query = query.filter(DocumentCollection.indexed.is_(False))

                doc_info = [(doc_id, title) for doc_id, title in query.all()]

            if not doc_info:
                yield f"data: {json.dumps({'type': 'complete', 'results': {'successful': 0, 'skipped': 0, 'failed': 0, 'message': 'No documents to index'}})}\n\n"
                return

            results = {"successful": 0, "skipped": 0, "failed": 0, "errors": []}
            total = len(doc_info)

            # Process documents in batches to optimize performance
            # Get batch size from settings
            settings = get_settings_manager()
            batch_size = int(
                settings.get_setting("rag.indexing_batch_size", 15)
            )
            processed = 0

            for i in range(0, len(doc_info), batch_size):
                batch = doc_info[i : i + batch_size]

                # Process batch with collection_id
                batch_results = rag_service.index_documents_batch(
                    batch, collection_id, force_reindex
                )

                # Process results and send progress updates
                for j, (doc_id, title) in enumerate(batch):
                    processed += 1
                    result = batch_results[doc_id]

                    # Send progress update
                    yield f"data: {json.dumps({'type': 'progress', 'current': processed, 'total': total, 'title': title, 'percent': int((processed / total) * 100)})}\n\n"

                    if result["status"] == "success":
                        results["successful"] += 1
                    elif result["status"] == "skipped":
                        results["skipped"] += 1
                    else:
                        results["failed"] += 1
                        results["errors"].append(
                            {
                                "doc_id": doc_id,
                                "title": title,
                                "error": result.get("error"),
                            }
                        )

            # Send completion status
            yield f"data: {json.dumps({'type': 'complete', 'results': results})}\n\n"

            # Log final status for debugging
            logger.info(
                f"Bulk indexing complete: {results['successful']} successful, {results['skipped']} skipped, {results['failed']} failed"
            )

        except Exception:
            logger.exception("Error in bulk indexing")
            yield f"data: {json.dumps({'type': 'error', 'error': 'An internal error occurred during indexing'})}\n\n"

    return Response(
        stream_with_context(generate()), mimetype="text/event-stream"
    )


@rag_bp.route("/api/rag/configure", methods=["POST"])
@login_required
def configure_rag():
    """
    Change RAG configuration (embedding model, chunk size, etc.).
    This will create a new index with the new configuration.
    """
    import json as json_lib

    try:
        data = request.get_json()
        embedding_model = data.get("embedding_model")
        embedding_provider = data.get("embedding_provider")
        chunk_size = data.get("chunk_size")
        chunk_overlap = data.get("chunk_overlap")
        collection_id = data.get("collection_id")

        # Get new advanced settings (with defaults)
        splitter_type = data.get("splitter_type", "recursive")
        text_separators = data.get(
            "text_separators", ["\n\n", "\n", ". ", " ", ""]
        )
        distance_metric = data.get("distance_metric", "cosine")
        normalize_vectors = data.get("normalize_vectors", True)
        index_type = data.get("index_type", "flat")

        if not all(
            [
                embedding_model,
                embedding_provider,
                chunk_size,
                chunk_overlap,
            ]
        ):
            return jsonify(
                {
                    "success": False,
                    "error": "All configuration parameters are required (embedding_model, embedding_provider, chunk_size, chunk_overlap)",
                }
            ), 400

        # Save settings to database
        settings = get_settings_manager()
        settings.set_setting("local_search_embedding_model", embedding_model)
        settings.set_setting(
            "local_search_embedding_provider", embedding_provider
        )
        settings.set_setting("local_search_chunk_size", int(chunk_size))
        settings.set_setting("local_search_chunk_overlap", int(chunk_overlap))

        # Save new advanced settings
        settings.set_setting("local_search_splitter_type", splitter_type)
        # Convert list to JSON string for storage
        if isinstance(text_separators, list):
            text_separators_str = json_lib.dumps(text_separators)
        else:
            text_separators_str = text_separators
        settings.set_setting(
            "local_search_text_separators", text_separators_str
        )
        settings.set_setting("local_search_distance_metric", distance_metric)
        settings.set_setting(
            "local_search_normalize_vectors", bool(normalize_vectors)
        )
        settings.set_setting("local_search_index_type", index_type)

        # If collection_id is provided, update that collection's configuration
        if collection_id:
            # Create new RAG service with new configuration
            new_rag_service = LibraryRAGService(
                username=session["username"],
                embedding_model=embedding_model,
                embedding_provider=embedding_provider,
                chunk_size=int(chunk_size),
                chunk_overlap=int(chunk_overlap),
                splitter_type=splitter_type,
                text_separators=text_separators
                if isinstance(text_separators, list)
                else json_lib.loads(text_separators),
                distance_metric=distance_metric,
                normalize_vectors=normalize_vectors,
                index_type=index_type,
            )

            # Get or create new index with this configuration
            rag_index = new_rag_service._get_or_create_rag_index(collection_id)

            return jsonify(
                {
                    "success": True,
                    "message": "Configuration updated for collection. You can now index documents with the new settings.",
                    "index_hash": rag_index.index_hash,
                }
            )
        else:
            # Just saving default settings without updating a specific collection
            return jsonify(
                {
                    "success": True,
                    "message": "Default embedding settings saved successfully. New collections will use these settings.",
                }
            )

    except Exception as e:
        return handle_api_error("configuring RAG", e)


@rag_bp.route("/api/rag/documents", methods=["GET"])
@login_required
def get_documents():
    """Get library documents with their RAG status for the default Library collection (paginated)."""
    from ...database.session_context import get_user_db_session
    from ...database.library_init import get_default_library_id

    try:
        # Get pagination parameters
        page = request.args.get("page", 1, type=int)
        per_page = request.args.get("per_page", 50, type=int)
        filter_type = request.args.get(
            "filter", "all"
        )  # all, indexed, unindexed

        # Validate pagination parameters
        page = max(1, page)
        per_page = min(max(10, per_page), 100)  # Limit between 10-100

        # Close current thread's session to force fresh connection
        from ...database.thread_local_session import cleanup_current_thread

        cleanup_current_thread()

        username = session["username"]

        # Get collection_id from request or use default Library collection
        collection_id = request.args.get("collection_id")
        if not collection_id:
            collection_id = get_default_library_id(username)

        logger.info(
            f"Getting documents for collection_id: {collection_id}, filter: {filter_type}, page: {page}"
        )

        with get_user_db_session(username) as db_session:
            # Expire all cached objects to ensure we get fresh data from DB
            db_session.expire_all()

            # Import RagDocumentStatus model
            from ...database.models.library import RagDocumentStatus

            # Build base query - join Document with DocumentCollection for the collection
            # LEFT JOIN with rag_document_status to check indexed status
            query = (
                db_session.query(
                    Document, DocumentCollection, RagDocumentStatus
                )
                .join(
                    DocumentCollection,
                    (DocumentCollection.document_id == Document.id)
                    & (DocumentCollection.collection_id == collection_id),
                )
                .outerjoin(
                    RagDocumentStatus,
                    (RagDocumentStatus.document_id == Document.id)
                    & (RagDocumentStatus.collection_id == collection_id),
                )
            )

            logger.debug(f"Base query for collection {collection_id}: {query}")

            # Apply filters based on rag_document_status existence
            if filter_type == "indexed":
                query = query.filter(RagDocumentStatus.document_id.isnot(None))
            elif filter_type == "unindexed":
                # Documents in collection but not indexed yet
                query = query.filter(RagDocumentStatus.document_id.is_(None))

            # Get total count before pagination
            total_count = query.count()
            logger.info(
                f"Found {total_count} total documents for collection {collection_id} with filter {filter_type}"
            )

            # Apply pagination
            results = (
                query.order_by(Document.created_at.desc())
                .limit(per_page)
                .offset((page - 1) * per_page)
                .all()
            )

            documents = [
                {
                    "id": doc.id,
                    "title": doc.title,
                    "original_url": doc.original_url,
                    "rag_indexed": rag_status is not None,
                    "chunk_count": rag_status.chunk_count if rag_status else 0,
                    "created_at": doc.created_at.isoformat()
                    if doc.created_at
                    else None,
                }
                for doc, doc_collection, rag_status in results
            ]

            # Debug logging to help diagnose indexing status issues
            indexed_count = sum(1 for d in documents if d["rag_indexed"])

            # Additional debug: check rag_document_status for this collection
            all_indexed_statuses = (
                db_session.query(RagDocumentStatus)
                .filter_by(collection_id=collection_id)
                .all()
            )
            logger.info(
                f"rag_document_status table shows: {len(all_indexed_statuses)} documents indexed for collection {collection_id}"
            )

            logger.info(
                f"Returning {len(documents)} documents on page {page}: "
                f"{indexed_count} indexed, {len(documents) - indexed_count} not indexed"
            )

        return jsonify(
            {
                "success": True,
                "documents": documents,
                "pagination": {
                    "page": page,
                    "per_page": per_page,
                    "total": total_count,
                    "pages": (total_count + per_page - 1) // per_page,
                },
            }
        )
    except Exception as e:
        return handle_api_error("getting documents", e)


@rag_bp.route("/api/rag/index-local", methods=["GET"])
@login_required
def index_local_library():
    """Index documents from a local folder with Server-Sent Events progress."""
    folder_path = request.args.get("path")
    file_patterns = request.args.get(
        "patterns", "*.pdf,*.txt,*.md,*.html"
    ).split(",")
    recursive = request.args.get("recursive", "true").lower() == "true"

    if not folder_path:
        return jsonify({"success": False, "error": "Path is required"}), 400

    # Validate and sanitize the path to prevent traversal attacks
    try:
        validated_path = PathValidator.validate_local_filesystem_path(
            folder_path
        )
        # Re-sanitize for static analyzer recognition (CodeQL)
        path = PathValidator.sanitize_for_filesystem_ops(validated_path)
    except ValueError as e:
        logger.warning(f"Path validation failed for '{folder_path}': {e}")
        return jsonify({"success": False, "error": "Invalid path"}), 400

    # Check path exists and is a directory
    if not path.exists():
        return jsonify({"success": False, "error": "Path does not exist"}), 400
    if not path.is_dir():
        return jsonify(
            {"success": False, "error": "Path is not a directory"}
        ), 400

    # Create RAG service in request context
    rag_service = get_rag_service()

    def generate():
        """Generator function for SSE progress updates."""
        try:
            # Send initial status
            yield f"data: {json.dumps({'type': 'start', 'message': f'Scanning folder: {path}'})}\n\n"

            # Find all matching files
            files_to_index = []
            for pattern in file_patterns:
                pattern = pattern.strip()
                if recursive:
                    search_pattern = str(path / "**" / pattern)
                else:
                    search_pattern = str(path / pattern)

                matching_files = glob.glob(search_pattern, recursive=recursive)
                files_to_index.extend(matching_files)

            # Remove duplicates and sort
            files_to_index = sorted(set(files_to_index))

            if not files_to_index:
                yield f"data: {json.dumps({'type': 'complete', 'results': {'successful': 0, 'skipped': 0, 'failed': 0, 'message': 'No matching files found'}})}\n\n"
                return

            results = {"successful": 0, "skipped": 0, "failed": 0, "errors": []}
            total = len(files_to_index)

            # Index each file
            for idx, file_path in enumerate(files_to_index, 1):
                file_name = Path(file_path).name

                # Send progress update
                yield f"data: {json.dumps({'type': 'progress', 'current': idx, 'total': total, 'filename': file_name, 'percent': int((idx / total) * 100)})}\n\n"

                try:
                    # Index the file directly using RAG service
                    result = rag_service.index_local_file(file_path)

                    if result.get("status") == "success":
                        results["successful"] += 1
                    elif result.get("status") == "skipped":
                        results["skipped"] += 1
                    else:
                        results["failed"] += 1
                        results["errors"].append(
                            {
                                "file": file_name,
                                "error": result.get("error", "Unknown error"),
                            }
                        )
                except Exception:
                    results["failed"] += 1
                    results["errors"].append(
                        {"file": file_name, "error": "Failed to index file"}
                    )
                    logger.exception(f"Error indexing file {file_path}")

            # Send completion status
            yield f"data: {json.dumps({'type': 'complete', 'results': results})}\n\n"

            logger.info(
                f"Local library indexing complete for {path}: "
                f"{results['successful']} successful, "
                f"{results['skipped']} skipped, "
                f"{results['failed']} failed"
            )

        except Exception:
            logger.exception("Error in local library indexing")
            yield f"data: {json.dumps({'type': 'error', 'error': 'An internal error occurred during indexing'})}\n\n"

    return Response(
        stream_with_context(generate()), mimetype="text/event-stream"
    )


# Collection Management Routes


@rag_bp.route("/api/collections", methods=["GET"])
@login_required
def get_collections():
    """Get all document collections for the current user."""
    from ...database.session_context import get_user_db_session

    try:
        username = session["username"]
        with get_user_db_session(username) as db_session:
            # No need to filter by username - each user has their own database
            collections = db_session.query(Collection).all()

            result = []
            for coll in collections:
                collection_data = {
                    "id": coll.id,
                    "name": coll.name,
                    "description": coll.description,
                    "created_at": coll.created_at.isoformat()
                    if coll.created_at
                    else None,
                    "collection_type": coll.collection_type,
                    "is_default": coll.is_default
                    if hasattr(coll, "is_default")
                    else False,
                    "document_count": len(coll.document_links)
                    if hasattr(coll, "document_links")
                    else 0,
                    "folder_count": len(coll.linked_folders)
                    if hasattr(coll, "linked_folders")
                    else 0,
                }

                # Include embedding metadata if available
                if coll.embedding_model:
                    collection_data["embedding"] = {
                        "model": coll.embedding_model,
                        "provider": coll.embedding_model_type.value
                        if coll.embedding_model_type
                        else None,
                        "dimension": coll.embedding_dimension,
                        "chunk_size": coll.chunk_size,
                        "chunk_overlap": coll.chunk_overlap,
                    }
                else:
                    collection_data["embedding"] = None

                result.append(collection_data)

        return jsonify({"success": True, "collections": result})
    except Exception as e:
        return handle_api_error("getting collections", e)


@rag_bp.route("/api/collections", methods=["POST"])
@login_required
def create_collection():
    """Create a new document collection."""
    from ...database.session_context import get_user_db_session

    try:
        data = request.get_json()
        name = data.get("name", "").strip()
        description = data.get("description", "").strip()
        collection_type = data.get("type", "user_uploads")

        if not name:
            return jsonify({"success": False, "error": "Name is required"}), 400

        username = session["username"]
        with get_user_db_session(username) as db_session:
            # Check if collection with this name already exists in this user's database
            existing = db_session.query(Collection).filter_by(name=name).first()

            if existing:
                return jsonify(
                    {
                        "success": False,
                        "error": f"Collection '{name}' already exists",
                    }
                ), 400

            # Create new collection (no username needed - each user has their own DB)
            # Note: created_at uses default=utcnow() in the model, so we don't need to set it manually
            collection = Collection(
                id=str(uuid.uuid4()),  # Generate UUID for collection
                name=name,
                description=description,
                collection_type=collection_type,
            )

            db_session.add(collection)
            db_session.commit()

            return jsonify(
                {
                    "success": True,
                    "collection": {
                        "id": collection.id,
                        "name": collection.name,
                        "description": collection.description,
                        "created_at": collection.created_at.isoformat(),
                        "collection_type": collection.collection_type,
                    },
                }
            )
    except Exception as e:
        return handle_api_error("creating collection", e)


@rag_bp.route("/api/collections/<string:collection_id>", methods=["PUT"])
@login_required
def update_collection(collection_id):
    """Update a collection's details."""
    from ...database.session_context import get_user_db_session

    try:
        data = request.get_json()
        name = data.get("name", "").strip()
        description = data.get("description", "").strip()

        username = session["username"]
        with get_user_db_session(username) as db_session:
            # No need to filter by username - each user has their own database
            collection = (
                db_session.query(Collection).filter_by(id=collection_id).first()
            )

            if not collection:
                return jsonify(
                    {"success": False, "error": "Collection not found"}
                ), 404

            if name:
                # Check if new name conflicts with existing collection
                existing = (
                    db_session.query(Collection)
                    .filter(
                        Collection.name == name,
                        Collection.id != collection_id,
                    )
                    .first()
                )

                if existing:
                    return jsonify(
                        {
                            "success": False,
                            "error": f"Collection '{name}' already exists",
                        }
                    ), 400

                collection.name = name

            if description is not None:  # Allow empty description
                collection.description = description

            db_session.commit()

            return jsonify(
                {
                    "success": True,
                    "collection": {
                        "id": collection.id,
                        "name": collection.name,
                        "description": collection.description,
                        "created_at": collection.created_at.isoformat()
                        if collection.created_at
                        else None,
                        "collection_type": collection.collection_type,
                    },
                }
            )
    except Exception as e:
        return handle_api_error("updating collection", e)


@rag_bp.route("/api/collections/<string:collection_id>", methods=["DELETE"])
@login_required
def delete_collection(collection_id):
    """Delete a collection and its orphaned documents."""
    from ..deletion.services.collection_deletion import (
        CollectionDeletionService,
    )

    try:
        username = session["username"]
        service = CollectionDeletionService(username)
        result = service.delete_collection(
            collection_id, delete_orphaned_documents=True
        )

        if result.get("deleted"):
            return jsonify(
                {
                    "success": True,
                    "message": "Collection deleted successfully",
                    "deleted_chunks": result.get("chunks_deleted", 0),
                    "orphaned_documents_deleted": result.get(
                        "orphaned_documents_deleted", 0
                    ),
                }
            )
        else:
            error = result.get("error", "Unknown error")
            status_code = 404 if "not found" in error.lower() else 400
            return jsonify({"success": False, "error": error}), status_code

    except Exception as e:
        return handle_api_error("deleting collection", e)


def extract_text_from_file(
    file_content: bytes, file_type: str, filename: str
) -> Optional[str]:
    """
    Extract text from uploaded file content.

    Args:
        file_content: Raw file content as bytes
        file_type: Type of file (pdf, text, markdown, html, etc.)
        filename: Original filename for logging

    Returns:
        Extracted text as string, or None if extraction failed
    """
    try:
        if file_type == "pdf":
            # Use the existing PDF text extraction functionality
            from ..downloaders.base import BaseDownloader

            text = BaseDownloader.extract_text_from_pdf(file_content)
            if text:
                logger.info(f"Successfully extracted text from PDF: {filename}")
                return text
            else:
                logger.warning(f"Could not extract text from PDF: {filename}")
                return None

        elif file_type in ["text", "txt", "markdown", "md"]:
            # Plain text files - just decode
            try:
                text = file_content.decode("utf-8")
                logger.info(f"Successfully read text file: {filename}")
                return text
            except UnicodeDecodeError:
                # Retry with errors='ignore' (same encoding, but skip invalid bytes)
                try:
                    text = file_content.decode("utf-8", errors="ignore")
                    logger.warning(
                        f"Read text file with encoding errors ignored: {filename}"
                    )
                    return text
                except Exception:
                    logger.exception(f"Failed to decode text file: {filename}")
                    return None

        elif file_type in ["html", "htm"]:
            # HTML files - extract text content
            try:
                from bs4 import BeautifulSoup

                html_content = file_content.decode("utf-8", errors="ignore")
                soup = BeautifulSoup(html_content, "html.parser")

                # Remove script and style elements
                for script in soup(["script", "style"]):
                    script.decompose()

                # Get text
                text = soup.get_text()

                # Clean up whitespace
                lines = (line.strip() for line in text.splitlines())
                chunks = (
                    phrase.strip()
                    for line in lines
                    for phrase in line.split("  ")
                )
                text = "\n".join(chunk for chunk in chunks if chunk)

                logger.info(
                    f"Successfully extracted text from HTML: {filename}"
                )
                return text
            except Exception:
                logger.exception(
                    f"Failed to extract text from HTML: {filename}"
                )
                return None

        elif file_type in ["docx", "doc"]:
            # Word documents - would need python-docx library
            logger.warning(
                f"Word document extraction not yet implemented: {filename}"
            )
            return None

        else:
            logger.warning(
                f"Unsupported file type for text extraction: {file_type} ({filename})"
            )
            return None

    except Exception:
        logger.exception(f"Error extracting text from {filename}")
        return None


@rag_bp.route(
    "/api/collections/<string:collection_id>/upload", methods=["POST"]
)
@login_required
def upload_to_collection(collection_id):
    """Upload files to a collection."""
    from ...database.session_context import get_user_db_session
    from werkzeug.utils import secure_filename
    from pathlib import Path
    import hashlib
    import uuid
    from ..services.pdf_storage_manager import PDFStorageManager

    try:
        if "files" not in request.files:
            return jsonify(
                {"success": False, "error": "No files provided"}
            ), 400

        files = request.files.getlist("files")
        if not files:
            return jsonify(
                {"success": False, "error": "No files selected"}
            ), 400

        username = session["username"]
        with get_user_db_session(username) as db_session:
            # Verify collection exists in this user's database
            collection = (
                db_session.query(Collection).filter_by(id=collection_id).first()
            )

            if not collection:
                return jsonify(
                    {"success": False, "error": "Collection not found"}
                ), 404

            # Get PDF storage mode from form data, falling back to user's setting
            settings = get_settings_manager()
            default_pdf_storage = settings.get_setting(
                "research_library.upload_pdf_storage", "none"
            )
            pdf_storage = request.form.get("pdf_storage", default_pdf_storage)
            if pdf_storage not in ("database", "none"):
                # Security: user uploads can only use database (encrypted) or none (text-only)
                # Filesystem storage is not allowed for user uploads
                pdf_storage = "none"

            # Initialize PDF storage manager if storing PDFs in database
            pdf_storage_manager = None
            if pdf_storage == "database":
                library_root = settings.get_setting(
                    "research_library.storage_path",
                    "~/.local-deep-research/library",
                )
                library_root = str(Path(library_root).expanduser())
                pdf_storage_manager = PDFStorageManager(
                    library_root=Path(library_root), storage_mode="database"
                )
                logger.info("PDF storage mode: database (encrypted)")
            else:
                logger.info("PDF storage mode: none (text-only)")

            uploaded_files = []
            errors = []

            for file in files:
                if not file.filename:
                    continue

                try:
                    # Read file content
                    file_content = file.read()
                    file.seek(0)  # Reset for potential re-reading

                    # Calculate file hash for deduplication
                    file_hash = hashlib.sha256(file_content).hexdigest()

                    # Check if document already exists
                    existing_doc = (
                        db_session.query(Document)
                        .filter_by(document_hash=file_hash)
                        .first()
                    )

                    if existing_doc:
                        # Document exists, check if we can upgrade to include PDF
                        pdf_upgraded = False
                        if (
                            pdf_storage == "database"
                            and pdf_storage_manager is not None
                        ):
                            pdf_upgraded = pdf_storage_manager.upgrade_to_pdf(
                                document=existing_doc,
                                pdf_content=file_content,
                                session=db_session,
                            )

                        # Check if already in collection
                        existing_link = (
                            db_session.query(DocumentCollection)
                            .filter_by(
                                document_id=existing_doc.id,
                                collection_id=collection_id,
                            )
                            .first()
                        )

                        if not existing_link:
                            # Add to collection
                            collection_link = DocumentCollection(
                                document_id=existing_doc.id,
                                collection_id=collection_id,
                                indexed=False,
                                chunk_count=0,
                            )
                            db_session.add(collection_link)
                            status = "added_to_collection"
                            if pdf_upgraded:
                                status = "added_to_collection_pdf_upgraded"
                            uploaded_files.append(
                                {
                                    "filename": existing_doc.filename,
                                    "status": status,
                                    "id": existing_doc.id,
                                    "pdf_upgraded": pdf_upgraded,
                                }
                            )
                        else:
                            status = "already_in_collection"
                            if pdf_upgraded:
                                status = "pdf_upgraded"
                            uploaded_files.append(
                                {
                                    "filename": existing_doc.filename,
                                    "status": status,
                                    "id": existing_doc.id,
                                    "pdf_upgraded": pdf_upgraded,
                                }
                            )
                    else:
                        # Create new document
                        filename = secure_filename(file.filename)

                        # Determine file type
                        file_extension = Path(filename).suffix.lower()
                        file_type = {
                            ".pdf": "pdf",
                            ".txt": "text",
                            ".md": "markdown",
                            ".html": "html",
                            ".htm": "html",
                            ".docx": "docx",
                            ".doc": "doc",
                        }.get(file_extension, "unknown")

                        # Extract text from the file
                        extracted_text = extract_text_from_file(
                            file_content, file_type, filename
                        )

                        # Clean the extracted text to remove surrogate characters
                        if extracted_text:
                            from ...text_processing import remove_surrogates

                            extracted_text = remove_surrogates(extracted_text)

                        if not extracted_text:
                            errors.append(
                                {
                                    "filename": filename,
                                    "error": f"Could not extract text from {file_type} file",
                                }
                            )
                            logger.warning(
                                f"Skipping file {filename} - no text could be extracted"
                            )
                            continue

                        # Get or create the user_upload source type
                        logger.info(
                            f"Getting or creating user_upload source type for {filename}"
                        )
                        source_type = (
                            db_session.query(SourceType)
                            .filter_by(name="user_upload")
                            .first()
                        )
                        if not source_type:
                            logger.info("Creating new user_upload source type")
                            source_type = SourceType(
                                id=str(uuid.uuid4()),
                                name="user_upload",
                                display_name="User Upload",
                                description="Documents uploaded by users",
                                icon="fas fa-upload",
                            )
                            db_session.add(source_type)
                            db_session.flush()
                            logger.info(
                                f"Created source type with ID: {source_type.id}"
                            )
                        else:
                            logger.info(
                                f"Found existing source type with ID: {source_type.id}"
                            )

                        # Create document with extracted text (no username needed - in user's own database)
                        # Note: uploaded_at uses default=utcnow() in the model, so we don't need to set it manually
                        doc_id = str(uuid.uuid4())
                        logger.info(
                            f"Creating document {doc_id} for {filename}"
                        )

                        # Determine storage mode and file_path
                        store_pdf_in_db = (
                            pdf_storage == "database"
                            and file_type == "pdf"
                            and pdf_storage_manager is not None
                        )

                        new_doc = Document(
                            id=doc_id,
                            source_type_id=source_type.id,
                            filename=filename,
                            document_hash=file_hash,
                            file_size=len(file_content),
                            file_type=file_type,
                            text_content=extracted_text,  # Always store extracted text
                            file_path=None
                            if store_pdf_in_db
                            else "text_only_not_stored",
                            storage_mode="database"
                            if store_pdf_in_db
                            else "none",
                        )
                        db_session.add(new_doc)
                        db_session.flush()  # Get the ID
                        logger.info(
                            f"Document {new_doc.id} created successfully"
                        )

                        # Store PDF in encrypted database if requested
                        pdf_stored = False
                        if store_pdf_in_db:
                            try:
                                pdf_storage_manager.save_pdf(
                                    pdf_content=file_content,
                                    document=new_doc,
                                    session=db_session,
                                    filename=filename,
                                )
                                pdf_stored = True
                                logger.info(
                                    f"PDF stored in encrypted database for {filename}"
                                )
                            except Exception:
                                logger.exception(
                                    f"Failed to store PDF in database for {filename}"
                                )
                                # Continue without PDF storage - text is still saved

                        # Add to collection
                        collection_link = DocumentCollection(
                            document_id=new_doc.id,
                            collection_id=collection_id,
                            indexed=False,
                            chunk_count=0,
                        )
                        db_session.add(collection_link)

                        uploaded_files.append(
                            {
                                "filename": filename,
                                "status": "uploaded",
                                "id": new_doc.id,
                                "text_length": len(extracted_text),
                                "pdf_stored": pdf_stored,
                            }
                        )

                except Exception:
                    errors.append(
                        {
                            "filename": file.filename,
                            "error": "Failed to upload file",
                        }
                    )
                    logger.exception(f"Error uploading file {file.filename}")

            db_session.commit()

            return jsonify(
                {
                    "success": True,
                    "uploaded": uploaded_files,
                    "errors": errors,
                    "summary": {
                        "total": len(files),
                        "successful": len(uploaded_files),
                        "failed": len(errors),
                    },
                }
            )

    except Exception as e:
        return handle_api_error("uploading files", e)


@rag_bp.route(
    "/api/collections/<string:collection_id>/documents", methods=["GET"]
)
@login_required
def get_collection_documents(collection_id):
    """Get all documents in a collection."""
    from ...database.session_context import get_user_db_session

    try:
        username = session["username"]
        with get_user_db_session(username) as db_session:
            # Verify collection exists in this user's database
            collection = (
                db_session.query(Collection).filter_by(id=collection_id).first()
            )

            if not collection:
                return jsonify(
                    {"success": False, "error": "Collection not found"}
                ), 404

            # Get documents through junction table
            doc_links = (
                db_session.query(DocumentCollection, Document)
                .join(Document)
                .filter(DocumentCollection.collection_id == collection_id)
                .all()
            )

            documents = []
            for link, doc in doc_links:
                # Check if PDF file is stored
                has_pdf = bool(
                    doc.file_path
                    and doc.file_path != "metadata_only"
                    and doc.file_path != "text_only_not_stored"
                )
                has_text_db = bool(doc.text_content)

                # Use title if available, otherwise filename
                display_title = doc.title or doc.filename or "Untitled"

                # Get source type name
                source_type_name = (
                    doc.source_type.name if doc.source_type else "unknown"
                )

                # Check if document is in other collections
                other_collections_count = (
                    db_session.query(DocumentCollection)
                    .filter(
                        DocumentCollection.document_id == doc.id,
                        DocumentCollection.collection_id != collection_id,
                    )
                    .count()
                )

                documents.append(
                    {
                        "id": doc.id,
                        "filename": display_title,
                        "title": display_title,
                        "file_type": doc.file_type,
                        "file_size": doc.file_size,
                        "uploaded_at": doc.created_at.isoformat()
                        if doc.created_at
                        else None,
                        "indexed": link.indexed,
                        "chunk_count": link.chunk_count,
                        "last_indexed_at": link.last_indexed_at.isoformat()
                        if link.last_indexed_at
                        else None,
                        "has_pdf": has_pdf,
                        "has_text_db": has_text_db,
                        "source_type": source_type_name,
                        "in_other_collections": other_collections_count > 0,
                        "other_collections_count": other_collections_count,
                    }
                )

            # Get index file size if available
            index_file_size = None
            index_file_size_bytes = None
            collection_name = f"collection_{collection_id}"
            rag_index = (
                db_session.query(RAGIndex)
                .filter_by(collection_name=collection_name)
                .first()
            )
            if rag_index and rag_index.index_path:
                from pathlib import Path

                index_path = Path(rag_index.index_path)
                if index_path.exists():
                    size_bytes = index_path.stat().st_size
                    index_file_size_bytes = size_bytes
                    # Format as human-readable
                    if size_bytes < 1024:
                        index_file_size = f"{size_bytes} B"
                    elif size_bytes < 1024 * 1024:
                        index_file_size = f"{size_bytes / 1024:.1f} KB"
                    else:
                        index_file_size = f"{size_bytes / (1024 * 1024):.1f} MB"

            return jsonify(
                {
                    "success": True,
                    "collection": {
                        "id": collection.id,
                        "name": collection.name,
                        "description": collection.description,
                        "embedding_model": collection.embedding_model,
                        "embedding_model_type": collection.embedding_model_type.value
                        if collection.embedding_model_type
                        else None,
                        "embedding_dimension": collection.embedding_dimension,
                        "chunk_size": collection.chunk_size,
                        "chunk_overlap": collection.chunk_overlap,
                        # Advanced settings
                        "splitter_type": collection.splitter_type,
                        "distance_metric": collection.distance_metric,
                        "index_type": collection.index_type,
                        "normalize_vectors": collection.normalize_vectors,
                        # Index file info
                        "index_file_size": index_file_size,
                        "index_file_size_bytes": index_file_size_bytes,
                    },
                    "documents": documents,
                }
            )

    except Exception as e:
        return handle_api_error("getting collection documents", e)


@rag_bp.route("/api/collections/<string:collection_id>/index", methods=["GET"])
@login_required
def index_collection(collection_id):
    """Index all documents in a collection with Server-Sent Events progress."""
    from ...database.session_context import get_user_db_session
    from ...database.session_passwords import session_password_store

    force_reindex = request.args.get("force_reindex", "false").lower() == "true"
    username = session["username"]
    session_id = session.get("session_id")

    logger.info(f"Starting index_collection, force_reindex={force_reindex}")

    # Get password for thread access to encrypted database
    db_password = None
    if session_id:
        db_password = session_password_store.get_session_password(
            username, session_id
        )

    # Create RAG service
    rag_service = get_rag_service(collection_id)
    # Set password for thread access
    rag_service.db_password = db_password
    logger.info(
        f"RAG service created: provider={rag_service.embedding_provider}"
    )

    def generate():
        """Generator for SSE progress updates."""
        logger.info("SSE generator started")
        try:
            with get_user_db_session(username, db_password) as db_session:
                # Verify collection exists in this user's database
                collection = (
                    db_session.query(Collection)
                    .filter_by(id=collection_id)
                    .first()
                )

                if not collection:
                    yield f"data: {json.dumps({'type': 'error', 'error': 'Collection not found'})}\n\n"
                    return

                # Store embedding metadata if this is the first time indexing
                if collection.embedding_model is None:
                    # Get embedding dimension from the embedding manager
                    embedding_dim = None
                    try:
                        # Try to get dimension from the embedding manager's provider
                        if hasattr(rag_service.embedding_manager, "provider"):
                            provider = rag_service.embedding_manager.provider
                            if hasattr(provider, "embedding_dimension"):
                                embedding_dim = provider.embedding_dimension
                    except Exception as e:
                        logger.warning(
                            f"Could not get embedding dimension: {e}"
                        )

                    # Store the embedding configuration used for this collection
                    collection.embedding_model = rag_service.embedding_model
                    collection.embedding_model_type = EmbeddingProvider(
                        rag_service.embedding_provider
                    )
                    collection.embedding_dimension = embedding_dim
                    collection.chunk_size = rag_service.chunk_size
                    collection.chunk_overlap = rag_service.chunk_overlap
                    # Store advanced settings
                    collection.splitter_type = rag_service.splitter_type
                    collection.text_separators = rag_service.text_separators
                    collection.distance_metric = rag_service.distance_metric
                    # Ensure normalize_vectors is a proper boolean for database
                    collection.normalize_vectors = bool(
                        rag_service.normalize_vectors
                    )
                    collection.index_type = rag_service.index_type
                    db_session.commit()
                    logger.info(
                        f"Stored embedding metadata for collection: provider={rag_service.embedding_provider}"
                    )

                # Get documents to index
                query = (
                    db_session.query(DocumentCollection, Document)
                    .join(Document)
                    .filter(DocumentCollection.collection_id == collection_id)
                )

                if not force_reindex:
                    # IMPORTANT: DO NOT CHANGE THIS LINE! DO NOT REMOVE THIS COMMENT!
                    # We MUST use `== False` instead of `not DocumentCollection.indexed`
                    # The Python `not` operator does NOT work correctly in SQLAlchemy filters.
                    # Using `not` will cause the query to return NO results (zero documents).
                    # SQLAlchemy requires explicit comparison: `== False` or `== True`
                    # This has been fixed multiple times - DO NOT change it back to `not`!
                    query = query.filter(DocumentCollection.indexed == False)  # noqa: E712

                doc_links = query.all()

                if not doc_links:
                    logger.info("No documents to index in collection")
                    yield f"data: {json.dumps({'type': 'complete', 'results': {'successful': 0, 'skipped': 0, 'failed': 0, 'message': 'No documents to index'}})}\n\n"
                    return

                total = len(doc_links)
                logger.info(f"Found {total} documents to index")
                results = {
                    "successful": 0,
                    "skipped": 0,
                    "failed": 0,
                    "errors": [],
                }

                yield f"data: {json.dumps({'type': 'start', 'message': f'Indexing {total} documents in collection: {collection.name}'})}\n\n"

                for idx, (link, doc) in enumerate(doc_links, 1):
                    filename = doc.filename or doc.title or "Unknown"
                    yield f"data: {json.dumps({'type': 'progress', 'current': idx, 'total': total, 'filename': filename, 'percent': int((idx / total) * 100)})}\n\n"

                    try:
                        logger.debug(
                            f"Indexing document {idx}/{total}: {filename}"
                        )

                        # Run index_document in a separate thread to allow sending SSE heartbeats.
                        # This keeps the HTTP connection alive during long indexing operations,
                        # preventing timeouts from proxy servers (nginx) and browsers.
                        # The main thread periodically yields heartbeat comments while waiting.
                        result_queue = queue.Queue()
                        error_queue = queue.Queue()

                        def index_in_thread():
                            try:
                                r = rag_service.index_document(
                                    document_id=doc.id,
                                    collection_id=collection_id,
                                    force_reindex=force_reindex,
                                )
                                result_queue.put(r)
                            except Exception as ex:
                                error_queue.put(ex)

                        thread = threading.Thread(target=index_in_thread)
                        thread.start()

                        # Send heartbeats while waiting for the thread to complete
                        heartbeat_interval = 5  # seconds
                        while thread.is_alive():
                            thread.join(timeout=heartbeat_interval)
                            if thread.is_alive():
                                # Send SSE comment as heartbeat (keeps connection alive)
                                yield f": heartbeat {idx}/{total}\n\n"

                        # Check for errors from thread
                        if not error_queue.empty():
                            raise error_queue.get()

                        result = result_queue.get()
                        logger.info(
                            f"Indexed document {idx}/{total}: {filename} - status={result.get('status')}"
                        )

                        if result.get("status") == "success":
                            results["successful"] += 1
                            # DocumentCollection status is already updated in index_document
                            # No need to update link here
                        elif result.get("status") == "skipped":
                            results["skipped"] += 1
                        else:
                            results["failed"] += 1
                            error_msg = result.get("error", "Unknown error")
                            results["errors"].append(
                                {
                                    "filename": filename,
                                    "error": error_msg,
                                }
                            )
                            logger.warning(
                                f"Failed to index {filename} ({idx}/{total}): {error_msg}"
                            )
                    except Exception as e:
                        results["failed"] += 1
                        error_msg = str(e) or "Failed to index document"
                        results["errors"].append(
                            {
                                "filename": filename,
                                "error": error_msg,
                            }
                        )
                        logger.exception(
                            f"Exception indexing document {filename} ({idx}/{total})"
                        )
                        # Send error update to client so they know indexing is continuing
                        yield f"data: {json.dumps({'type': 'doc_error', 'filename': filename, 'error': error_msg})}\n\n"

                db_session.commit()
                # Ensure all changes are written to disk
                db_session.flush()

            logger.info(
                f"Indexing complete: {results['successful']} successful, {results['failed']} failed, {results['skipped']} skipped"
            )
            yield f"data: {json.dumps({'type': 'complete', 'results': results})}\n\n"
            logger.info("SSE generator finished successfully")

        except Exception:
            logger.exception("Error in collection indexing")
            yield f"data: {json.dumps({'type': 'error', 'error': 'An internal error occurred during indexing'})}\n\n"

    response = Response(
        stream_with_context(generate()), mimetype="text/event-stream"
    )
    # Prevent buffering for proper SSE streaming
    response.headers["Cache-Control"] = "no-cache, no-transform"
    response.headers["Connection"] = "keep-alive"
    response.headers["X-Accel-Buffering"] = "no"
    return response


# =============================================================================
# Background Indexing Endpoints
# =============================================================================


def _get_rag_service_for_thread(
    collection_id: str,
    username: str,
    db_password: str,
) -> LibraryRAGService:
    """
    Create RAG service for use in background threads (no Flask context).
    """
    from ...database.session_context import get_user_db_session
    from ...settings.manager import SettingsManager
    from ...web_search_engines.engines.search_engine_local import (
        LocalEmbeddingManager,
    )
    import json

    with get_user_db_session(username, db_password) as db_session:
        settings_manager = SettingsManager(db_session)

        # Get default settings
        default_embedding_model = settings_manager.get_setting(
            "local_search_embedding_model", "all-MiniLM-L6-v2"
        )
        default_embedding_provider = settings_manager.get_setting(
            "local_search_embedding_provider", "sentence_transformers"
        )
        default_chunk_size = int(
            settings_manager.get_setting("local_search_chunk_size", 1000)
        )
        default_chunk_overlap = int(
            settings_manager.get_setting("local_search_chunk_overlap", 200)
        )
        default_splitter_type = settings_manager.get_setting(
            "local_search_splitter_type", "recursive"
        )
        default_text_separators = settings_manager.get_setting(
            "local_search_text_separators", '["\n\n", "\n", ". ", " ", ""]'
        )
        if isinstance(default_text_separators, str):
            try:
                default_text_separators = json.loads(default_text_separators)
            except json.JSONDecodeError:
                logger.warning(
                    f"Invalid JSON for local_search_text_separators setting: {default_text_separators!r}. "
                    "Using default separators."
                )
                default_text_separators = ["\n\n", "\n", ". ", " ", ""]
        default_distance_metric = settings_manager.get_setting(
            "local_search_distance_metric", "cosine"
        )
        raw_normalize = settings_manager.get_setting(
            "local_search_normalize_vectors", True
        )
        if isinstance(raw_normalize, str):
            default_normalize_vectors = raw_normalize.lower() in (
                "true",
                "1",
                "yes",
            )
        else:
            default_normalize_vectors = bool(raw_normalize)
        default_index_type = settings_manager.get_setting(
            "local_search_index_type", "flat"
        )

        # Get settings snapshot for embedding manager
        settings_snapshot = settings_manager.get_settings_snapshot()
        settings_snapshot["_username"] = username

        # Check for collection's stored settings
        collection = (
            db_session.query(Collection).filter_by(id=collection_id).first()
        )

        if collection and collection.embedding_model:
            # Use collection's stored settings
            embedding_model = collection.embedding_model
            embedding_provider = collection.embedding_model_type.value
            chunk_size = collection.chunk_size or default_chunk_size
            chunk_overlap = collection.chunk_overlap or default_chunk_overlap
            splitter_type = collection.splitter_type or default_splitter_type
            text_separators = (
                collection.text_separators or default_text_separators
            )
            distance_metric = (
                collection.distance_metric or default_distance_metric
            )
            index_type = collection.index_type or default_index_type

            coll_normalize = collection.normalize_vectors
            if coll_normalize is not None:
                if isinstance(coll_normalize, str):
                    coll_normalize = coll_normalize.lower() in (
                        "true",
                        "1",
                        "yes",
                    )
                else:
                    coll_normalize = bool(coll_normalize)
            else:
                coll_normalize = default_normalize_vectors
            normalize_vectors = coll_normalize
        else:
            # Use default settings
            embedding_model = default_embedding_model
            embedding_provider = default_embedding_provider
            chunk_size = default_chunk_size
            chunk_overlap = default_chunk_overlap
            splitter_type = default_splitter_type
            text_separators = default_text_separators
            distance_metric = default_distance_metric
            normalize_vectors = default_normalize_vectors
            index_type = default_index_type

        # Update settings snapshot with embedding config
        settings_snapshot.update(
            {
                "embeddings.provider": embedding_provider,
                f"embeddings.{embedding_provider}.model": embedding_model,
                "local_search_chunk_size": chunk_size,
                "local_search_chunk_overlap": chunk_overlap,
            }
        )

        # Create embedding manager (to avoid database access in LibraryRAGService.__init__)
        embedding_manager = LocalEmbeddingManager(
            embedding_model=embedding_model,
            embedding_model_type=embedding_provider,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            settings_snapshot=settings_snapshot,
        )
        embedding_manager.db_password = db_password

        # Create RAG service with pre-built embedding manager and db_password
        rag_service = LibraryRAGService(
            username=username,
            embedding_model=embedding_model,
            embedding_provider=embedding_provider,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            splitter_type=splitter_type,
            text_separators=text_separators,
            distance_metric=distance_metric,
            normalize_vectors=normalize_vectors,
            index_type=index_type,
            embedding_manager=embedding_manager,
            db_password=db_password,
        )

    return rag_service


def _background_index_worker(
    task_id: str,
    collection_id: str,
    username: str,
    db_password: str,
    force_reindex: bool,
):
    """
    Background worker thread for indexing documents.
    Updates TaskMetadata with progress and checks for cancellation.
    """
    from ...database.session_context import get_user_db_session

    try:
        # Create RAG service (thread-safe, no Flask context needed)
        rag_service = _get_rag_service_for_thread(
            collection_id, username, db_password
        )

        with get_user_db_session(username, db_password) as db_session:
            # Get collection
            collection = (
                db_session.query(Collection).filter_by(id=collection_id).first()
            )

            if not collection:
                _update_task_status(
                    username,
                    db_password,
                    task_id,
                    status="failed",
                    error_message="Collection not found",
                )
                return

            # Store embedding metadata if first time
            if collection.embedding_model is None:
                collection.embedding_model = rag_service.embedding_model
                collection.embedding_model_type = EmbeddingProvider(
                    rag_service.embedding_provider
                )
                collection.chunk_size = rag_service.chunk_size
                collection.chunk_overlap = rag_service.chunk_overlap
                collection.splitter_type = rag_service.splitter_type
                collection.text_separators = rag_service.text_separators
                collection.distance_metric = rag_service.distance_metric
                collection.normalize_vectors = bool(
                    rag_service.normalize_vectors
                )
                collection.index_type = rag_service.index_type
                db_session.commit()

            # Get documents to index
            query = (
                db_session.query(DocumentCollection, Document)
                .join(Document)
                .filter(DocumentCollection.collection_id == collection_id)
            )

            if not force_reindex:
                query = query.filter(DocumentCollection.indexed == False)  # noqa: E712

            doc_links = query.all()

            if not doc_links:
                _update_task_status(
                    username,
                    db_password,
                    task_id,
                    status="completed",
                    progress_message="No documents to index",
                )
                return

            total = len(doc_links)
            results = {"successful": 0, "skipped": 0, "failed": 0}

            # Update task with total count
            _update_task_status(
                username,
                db_password,
                task_id,
                progress_total=total,
                progress_message=f"Indexing {total} documents",
            )

            for idx, (link, doc) in enumerate(doc_links, 1):
                # Check if cancelled
                if _is_task_cancelled(username, db_password, task_id):
                    _update_task_status(
                        username,
                        db_password,
                        task_id,
                        status="cancelled",
                        progress_message=f"Cancelled after {idx - 1}/{total} documents",
                    )
                    logger.info(f"Indexing task {task_id} was cancelled")
                    return

                filename = doc.filename or doc.title or "Unknown"

                # Update progress with filename
                _update_task_status(
                    username,
                    db_password,
                    task_id,
                    progress_current=idx,
                    progress_message=f"Indexing {idx}/{total}: {filename}",
                )

                try:
                    result = rag_service.index_document(
                        document_id=doc.id,
                        collection_id=collection_id,
                        force_reindex=force_reindex,
                    )

                    if result.get("status") == "success":
                        results["successful"] += 1
                    elif result.get("status") == "skipped":
                        results["skipped"] += 1
                    else:
                        results["failed"] += 1

                except Exception:
                    results["failed"] += 1
                    logger.exception(f"Error indexing document {idx}/{total}")

            db_session.commit()

        # Mark as completed
        _update_task_status(
            username,
            db_password,
            task_id,
            status="completed",
            progress_current=total,
            progress_message=f"Completed: {results['successful']} indexed, {results['failed']} failed, {results['skipped']} skipped",
        )
        logger.info(f"Background indexing task {task_id} completed: {results}")

    except Exception as e:
        logger.exception(f"Background indexing task {task_id} failed")
        _update_task_status(
            username,
            db_password,
            task_id,
            status="failed",
            error_message=str(e),
        )


def _update_task_status(
    username: str,
    db_password: str,
    task_id: str,
    status: str = None,
    progress_current: int = None,
    progress_total: int = None,
    progress_message: str = None,
    error_message: str = None,
):
    """Update task metadata in the database."""
    from ...database.session_context import get_user_db_session

    try:
        with get_user_db_session(username, db_password) as db_session:
            task = (
                db_session.query(TaskMetadata)
                .filter_by(task_id=task_id)
                .first()
            )
            if task:
                if status is not None:
                    task.status = status
                    if status == "completed":
                        task.completed_at = datetime.now(UTC)
                if progress_current is not None:
                    task.progress_current = progress_current
                if progress_total is not None:
                    task.progress_total = progress_total
                if progress_message is not None:
                    task.progress_message = progress_message
                if error_message is not None:
                    task.error_message = error_message
                db_session.commit()
    except Exception:
        logger.exception(f"Failed to update task status for {task_id}")


def _is_task_cancelled(username: str, db_password: str, task_id: str) -> bool:
    """Check if a task has been cancelled."""
    from ...database.session_context import get_user_db_session

    try:
        with get_user_db_session(username, db_password) as db_session:
            task = (
                db_session.query(TaskMetadata)
                .filter_by(task_id=task_id)
                .first()
            )
            return task and task.status == "cancelled"
    except Exception:
        return False


@rag_bp.route(
    "/api/collections/<string:collection_id>/index/start", methods=["POST"]
)
@login_required
def start_background_index(collection_id):
    """Start background indexing for a collection."""
    from ...database.session_context import get_user_db_session
    from ...database.session_passwords import session_password_store

    username = session["username"]
    session_id = session.get("session_id")

    # Get password for thread access
    db_password = None
    if session_id:
        db_password = session_password_store.get_session_password(
            username, session_id
        )

    # Parse request body
    data = request.get_json() or {}
    force_reindex = data.get("force_reindex", False)

    try:
        with get_user_db_session(username, db_password) as db_session:
            # Check if there's already an active indexing task for this collection
            existing_task = (
                db_session.query(TaskMetadata)
                .filter(
                    TaskMetadata.task_type == "indexing",
                    TaskMetadata.status == "processing",
                )
                .first()
            )

            if existing_task:
                # Check if it's for this collection
                metadata = existing_task.metadata_json or {}
                if metadata.get("collection_id") == collection_id:
                    return jsonify(
                        {
                            "success": False,
                            "error": "Indexing is already in progress for this collection",
                            "task_id": existing_task.task_id,
                        }
                    ), 409

            # Create new task
            task_id = str(uuid.uuid4())
            task = TaskMetadata(
                task_id=task_id,
                status="processing",
                task_type="indexing",
                created_at=datetime.now(UTC),
                started_at=datetime.now(UTC),
                progress_current=0,
                progress_total=0,
                progress_message="Starting indexing...",
                metadata_json={
                    "collection_id": collection_id,
                    "force_reindex": force_reindex,
                },
            )
            db_session.add(task)
            db_session.commit()

        # Start background thread
        thread = threading.Thread(
            target=_background_index_worker,
            args=(task_id, collection_id, username, db_password, force_reindex),
            daemon=True,
        )
        thread.start()

        logger.info(
            f"Started background indexing task {task_id} for collection {collection_id}"
        )

        return jsonify(
            {
                "success": True,
                "task_id": task_id,
                "message": "Indexing started in background",
            }
        )

    except Exception:
        logger.exception("Failed to start background indexing")
        return jsonify(
            {
                "success": False,
                "error": "Failed to start indexing. Please try again.",
            }
        ), 500


@rag_bp.route(
    "/api/collections/<string:collection_id>/index/status", methods=["GET"]
)
@limiter.exempt
@login_required
def get_index_status(collection_id):
    """Get the current indexing status for a collection."""
    from ...database.session_context import get_user_db_session
    from ...database.session_passwords import session_password_store

    username = session["username"]
    session_id = session.get("session_id")

    db_password = None
    if session_id:
        db_password = session_password_store.get_session_password(
            username, session_id
        )

    try:
        with get_user_db_session(username, db_password) as db_session:
            # Find the most recent indexing task for this collection
            task = (
                db_session.query(TaskMetadata)
                .filter(TaskMetadata.task_type == "indexing")
                .order_by(TaskMetadata.created_at.desc())
                .first()
            )

            if not task:
                return jsonify(
                    {
                        "status": "idle",
                        "message": "No indexing task found",
                    }
                )

            # Check if it's for this collection
            metadata = task.metadata_json or {}
            if metadata.get("collection_id") != collection_id:
                return jsonify(
                    {
                        "status": "idle",
                        "message": "No indexing task for this collection",
                    }
                )

            return jsonify(
                {
                    "task_id": task.task_id,
                    "status": task.status,
                    "progress_current": task.progress_current or 0,
                    "progress_total": task.progress_total or 0,
                    "progress_message": task.progress_message,
                    "error_message": task.error_message,
                    "created_at": task.created_at.isoformat()
                    if task.created_at
                    else None,
                    "completed_at": task.completed_at.isoformat()
                    if task.completed_at
                    else None,
                }
            )

    except Exception:
        logger.exception("Failed to get index status")
        return jsonify(
            {
                "status": "error",
                "error": "Failed to get indexing status. Please try again.",
            }
        ), 500


@rag_bp.route(
    "/api/collections/<string:collection_id>/index/cancel", methods=["POST"]
)
@login_required
def cancel_indexing(collection_id):
    """Cancel an active indexing task for a collection."""
    from ...database.session_context import get_user_db_session
    from ...database.session_passwords import session_password_store

    username = session["username"]
    session_id = session.get("session_id")

    db_password = None
    if session_id:
        db_password = session_password_store.get_session_password(
            username, session_id
        )

    try:
        with get_user_db_session(username, db_password) as db_session:
            # Find active indexing task for this collection
            task = (
                db_session.query(TaskMetadata)
                .filter(
                    TaskMetadata.task_type == "indexing",
                    TaskMetadata.status == "processing",
                )
                .first()
            )

            if not task:
                return jsonify(
                    {
                        "success": False,
                        "error": "No active indexing task found",
                    }
                ), 404

            # Check if it's for this collection
            metadata = task.metadata_json or {}
            if metadata.get("collection_id") != collection_id:
                return jsonify(
                    {
                        "success": False,
                        "error": "No active indexing task for this collection",
                    }
                ), 404

            # Mark as cancelled - the worker thread will check this
            task.status = "cancelled"
            task.progress_message = "Cancellation requested..."
            db_session.commit()

            logger.info(
                f"Cancelled indexing task {task.task_id} for collection {collection_id}"
            )

            return jsonify(
                {
                    "success": True,
                    "message": "Cancellation requested",
                    "task_id": task.task_id,
                }
            )

    except Exception:
        logger.exception("Failed to cancel indexing")
        return jsonify(
            {
                "success": False,
                "error": "Failed to cancel indexing. Please try again.",
            }
        ), 500
